In order to start the modelling phase, our team decided to modify the already existing preprocessing steps, so that the final dataset is better suited to the needs of the project.
%reload_ext pretty_jupyter
import os
import pickle
import numpy as np
import pandas as pd
from scipy import stats
import scipy.signal as scisig
import scipy.stats
import cvxEDA
import gc
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
Configurations¶
In comparison to the original script, our version introduces a sliding window with overlaps. This way we can have more data points in the final dataset, and the probability of missing important information is minimized. In this version, the following was changed:
- Window length was increased to 45 seconds;
- Interval was changed to 15 seconds.
# -.-|m { input_fold: show }
# Sliding-window configuration: the effective window spans 45 s and
# advances every 15 s, so consecutive windows overlap by 30 s.
WINDOW_IN_SECONDS = 30  # base window size used by the original script (seconds)
WINDOW_ADDITION = 15  # extra seconds appended to each window in our version
WINDOW_INTERVAL = 15  # step between consecutive window starts (seconds)
WINDOW_LENGTH = WINDOW_IN_SECONDS + WINDOW_ADDITION  # total window length: 45 s
data_folder = "../data"
INPUT_FOLDER = f"{data_folder}/01_raw/WESAD"  # raw WESAD subject pickles
OUTPUT_FOLDER = f"{data_folder}/03_primary/WESAD"  # per-subject and combined CSVs
Functions and Class Definition¶
First, we set up global variables and constants.
# E4 (wrist) sampling frequencies in Hz; 'label' and 'Resp' come from the
# 700 Hz chest device.
fs_dict = {'ACC': 32, 'BVP': 64, 'EDA': 4, 'TEMP': 4, 'label': 700, 'Resp': 700}
# Mapping between condition names and the class labels used in the output.
label_dict = {'baseline': 1, 'stress': 2, 'amusement': 0}
int_to_label = {1: 'baseline', 2: 'stress', 0: 'amusement'}
# Feature-column names, built lazily on the first window (see get_samples).
feat_names = None
subject_feature_path = '/subject_feats'
# exist_ok=True creates the whole path (including OUTPUT_FOLDER itself) in one
# call and avoids the race-prone exists()-then-makedirs() pattern.
os.makedirs(OUTPUT_FOLDER + subject_feature_path, exist_ok=True)
Next, we create a class to read and parse the whole dataset.
class SubjectData:
    """Load one WESAD subject's pickled recording and expose its signals."""

    def __init__(self, main_path, subject_number):
        """Read `<main_path>/S<n>/S<n>.pkl` into memory."""
        self.name = f'S{subject_number}'
        self.subject_keys = ['signal', 'label', 'subject']
        self.signal_keys = ['chest', 'wrist']
        self.chest_keys = ['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp']
        self.wrist_keys = ['ACC', 'BVP', 'EDA', 'TEMP']
        pkl_path = os.path.join(main_path, self.name) + '/' + self.name + '.pkl'
        # latin1 encoding is required to unpickle the Python-2 WESAD files
        with open(pkl_path, 'rb') as file:
            self.data = pickle.load(file, encoding='latin1')
        self.labels = self.data['label']

    def get_wrist_data(self):
        """Return the wrist signals, augmented with the chest Resp channel."""
        wrist = self.data['signal']['wrist']
        wrist['Resp'] = self.data['signal']['chest']['Resp']
        return wrist

    def get_chest_data(self):
        """Return the raw chest-device signal dict."""
        return self.data['signal']['chest']

    def extract_features(self):  # only wrist
        """Per-signal statistics for every wrist channel."""
        # NOTE(review): get_statistics is not defined anywhere in this file —
        # confirm where it is meant to be imported from before calling this.
        return {
            key: get_statistics(self.get_wrist_data()[key].flatten(), self.labels, key)
            for key in self.wrist_keys
        }
We define functions which help extract various features from the EDA, temperature and acceleration data.
def eda_stats(y):
    """Decompose an EDA signal with cvxEDA after z-scoring it.

    Returns the seven cvxEDA outputs as a list:
    [phasic, smna, tonic, l, d, e, objective].
    """
    sampling_rate = fs_dict['EDA']
    standardized = (y - y.mean()) / y.std()
    r, p, t, l, d, e, obj = cvxEDA.cvxEDA(standardized, 1. / sampling_rate)
    return [r, p, t, l, d, e, obj]
def butter_lowpass(cutoff, fs, order=5):
    """Design a digital Butterworth low-pass filter.

    Returns the (b, a) transfer-function coefficients for the given cutoff
    frequency (Hz), sampling rate `fs` (Hz) and filter order.
    """
    nyquist = 0.5 * fs
    return scisig.butter(order, cutoff / nyquist, btype='low', analog=False)
def butter_lowpass_filter(data, cutoff, fs, order=5):
    """Apply a forward-only Butterworth low-pass filter to `data`.

    `cutoff` and `fs` are in Hz; the filter design is inlined here rather
    than delegated to a helper.
    """
    normalized_cutoff = cutoff / (0.5 * fs)
    b, a = scisig.butter(order, normalized_cutoff, btype='low', analog=False)
    return scisig.lfilter(b, a, data)
def get_slope(series):
    """Return the slope of an OLS line fitted to `series` against 0..n-1."""
    fit = scipy.stats.linregress(np.arange(len(series)), series)
    return fit.slope
def get_window_stats(data, label=-1):
    """Summary statistics of one window, plus its class label.

    Returns a dict with 'mean', 'std', 'min', 'max' (each computed over
    `data` by the corresponding numpy reduction) and 'label'.
    """
    return {
        'mean': np.mean(data),
        'std': np.std(data),
        'min': np.amin(data),
        'max': np.amax(data),
        'label': label,
    }
def get_net_accel(data):
    """Element-wise Euclidean norm of the three accelerometer axes.

    `data` must expose 'ACC_x', 'ACC_y' and 'ACC_z' columns (e.g. a
    DataFrame); returns a Series of the same length.
    """
    # np.sqrt is vectorized over the Series — no per-element .apply needed.
    return np.sqrt(data['ACC_x'] ** 2 + data['ACC_y'] ** 2 + data['ACC_z'] ** 2)
def get_peak_freq(x, fs=8):
    """Frequency (Hz) of the strongest periodogram component of `x`.

    `fs` is the sampling rate in Hz; it defaults to 8 to preserve the
    original hard-coded behaviour.
    """
    f, Pxx = scisig.periodogram(x, fs=fs)
    # Take the frequency at the maximum power directly. The original built a
    # dict keyed by amplitude and took max(keys) — equivalent here, but it
    # silently collapsed duplicate power values and built a throwaway dict.
    return f[np.argmax(Pxx)]
# https://github.com/MITMediaLabAffectiveComputing/eda-explorer/blob/master/AccelerometerFeatureExtractionScript.py
def filterSignalFIR(eda, cutoff=0.4, numtaps=64):
    """FIR low-pass filter a signal sampled at the ACC rate (fs_dict['ACC']).

    `cutoff` is in Hz and is normalised against the ACC Nyquist frequency;
    `numtaps` is the FIR filter length.
    """
    normalized_cutoff = cutoff / (fs_dict['ACC'] / 2.0)
    coefficients = scisig.firwin(numtaps, normalized_cutoff)
    return scisig.lfilter(coefficients, 1, eda)
The next code cell contains a function which removes all non-Empatica E4 data, keeps only the observations recorded during three specific stages of the experiment, and extracts new features from the collected data.
def compute_features(e4_data_dict, labels, norm_type=None):
    """Merge the wrist signals into one time-indexed frame, split by condition.

    Parameters
    ----------
    e4_data_dict : dict with raw arrays for 'EDA', 'BVP', 'ACC', 'TEMP', 'Resp'
    labels : per-sample condition labels recorded at 700 Hz
    norm_type : None, 'std' or 'minmax' — optional normalisation of the
        merged frame (NOTE: it is applied to every column, including 'label')

    Returns
    -------
    (grouped, baseline, stress, amusement): the groupby object plus the rows
    whose raw WESAD label is 1, 2 and 3 respectively.
    """
    # Dataframes for each sensor type
    eda_df = pd.DataFrame(e4_data_dict['EDA'], columns=['EDA'])
    bvp_df = pd.DataFrame(e4_data_dict['BVP'], columns=['BVP'])
    acc_df = pd.DataFrame(e4_data_dict['ACC'], columns=['ACC_x', 'ACC_y', 'ACC_z'])
    temp_df = pd.DataFrame(e4_data_dict['TEMP'], columns=['TEMP'])
    label_df = pd.DataFrame(labels, columns=['label'])
    resp_df = pd.DataFrame(e4_data_dict['Resp'], columns=['Resp'])

    # Filter EDA
    eda_df['EDA'] = butter_lowpass_filter(eda_df['EDA'], 1.0, fs_dict['EDA'], 6)

    # Filter ACC, one axis at a time.
    # BUGFIX: the original passed the whole (n, 3) acc_df.values array for
    # every column, which filters across the axes dimension rather than
    # along time.
    for col in acc_df.columns:
        acc_df[col] = filterSignalFIR(acc_df[col].values)

    # Give every frame a time index derived from its own sampling frequency
    # (the rates differ per sensor), then convert it to datetimes so the
    # outer joins below align the signals on a common timeline.
    # NOTE: resp_df is indexed here but never joined — preserved as-is.
    for sig_df, key in ((eda_df, 'EDA'), (bvp_df, 'BVP'), (acc_df, 'ACC'),
                        (temp_df, 'TEMP'), (label_df, 'label'), (resp_df, 'Resp')):
        sig_df.index = [(1 / fs_dict[key]) * i for i in range(len(sig_df))]
        sig_df.index = pd.to_datetime(sig_df.index, unit='s')

    # New EDA features from the cvxEDA decomposition
    r, p, t, l, d, e, obj = eda_stats(eda_df['EDA'])
    eda_df['EDA_phasic'] = r
    eda_df['EDA_smna'] = p
    eda_df['EDA_tonic'] = t

    # Combined dataframe; back-fill so every sample inherits the next
    # recorded 700 Hz label.
    df = eda_df.join(bvp_df, how='outer')
    df = df.join(temp_df, how='outer')
    df = df.join(acc_df, how='outer')
    df = df.join(label_df, how='outer')
    df['label'] = df['label'].bfill()
    df.reset_index(drop=True, inplace=True)

    # Drop rows that carry no signal data at all, then re-attach the label.
    df_ = df.drop(columns=['label'])
    df_ = df_.dropna(how='all')
    df_merged = df_.join(df['label'], how='left')

    if norm_type == 'std':
        # Standard-score normalisation
        df_merged = (df_merged - df_merged.mean()) / df_merged.std()
    elif norm_type == 'minmax':
        # Min-max normalisation.
        # BUGFIX: the original line was missing the '=' assignment, so the
        # result was discarded (and the bare call raised TypeError).
        df_merged = (df_merged - df_merged.min()) / (df_merged.max() - df_merged.min())

    # Group by the raw WESAD labels: 1 = baseline, 2 = stress, 3 = amusement.
    grouped = df_merged.groupby('label')
    baseline = grouped.get_group(1)
    stress = grouped.get_group(2)
    amusement = grouped.get_group(3)

    return grouped, baseline, stress, amusement
The following function combines the collected data based on the defined window length and the interval between windows.
def get_samples(data, label):
    """Slice `data` into overlapping windows and compute per-window features.

    Parameters
    ----------
    data : DataFrame of one condition's rows (output of compute_features)
    label : int class label written into every resulting sample row

    Returns
    -------
    DataFrame with one row per window: mean/std/min/max of every signal
    column plus 'BVP_peak_freq' and 'TEMP_slope'.
    """
    global feat_names

    # Using BVP freq (64 Hz) as our reference frequency due to it being the
    # largest and thus encompassing the lesser ones in its resolution.
    window_len = fs_dict['BVP'] * WINDOW_LENGTH
    window_int = fs_dict['BVP'] * WINDOW_INTERVAL

    samples = []
    i = 0
    while (window_int * i + window_len) < len(data):
        # Current window of rows
        w = data[window_int * i: window_int * i + window_len]

        # BUGFIX: the original concatenated along axis 0 (the default), which
        # appended the net-acceleration values as misaligned extra rows and
        # then renamed an unrelated first column. Prepend the net
        # acceleration as column 0 instead, so the rename below labels it
        # correctly (matching the intended column order noted in the
        # original source).
        w = pd.concat([get_net_accel(w), w], axis=1)
        cols = list(w.columns)
        cols[0] = 'net_acc'
        w.columns = cols

        # Per-window statistics: rows are the signal columns, columns are the
        # stat names; the scalar label is broadcast into a 'label' column.
        wstats = get_window_stats(data=w, label=label)
        x = pd.DataFrame(wstats).drop('label', axis=0)
        y = x['label'].iloc[0]
        x.drop('label', axis=1, inplace=True)

        # Feature names are built once, on the first window, and reused.
        if feat_names is None:
            feat_names = ['_'.join([str(row), str(col)])
                          for row in x.index for col in x.columns]

        # Flatten the stats table into a single sample row
        wdf = pd.DataFrame(x.values.flatten()).T
        wdf.columns = feat_names
        wdf = pd.concat([wdf, pd.DataFrame({'label': y}, index=[0])], axis=1)

        # Additional spectral / trend features
        wdf['BVP_peak_freq'] = get_peak_freq(w['BVP'].dropna())
        wdf['TEMP_slope'] = get_slope(w['TEMP'].dropna())
        samples.append(wdf)
        i += 1

    return pd.concat(samples)
Next function will pre-process data per subject.
def make_patient_data(subject_id):
    """Pre-process one subject end-to-end and write S<id>_feats_4.csv."""
    # Load the raw pickle for subject S<subject_id>
    subject = SubjectData(main_path=INPUT_FOLDER, subject_number=subject_id)

    # Empatica E4 data - now with resp
    e4_data_dict = subject.get_wrist_data()

    # The 3 classes we are classifying (no normalisation applied)
    grouped, baseline, stress, amusement = compute_features(
        e4_data_dict, subject.labels, None)

    # Window each condition and tag it with its class label
    per_condition = [
        get_samples(baseline, 1),
        get_samples(stress, 2),
        get_samples(amusement, 0),
    ]
    all_samples = pd.concat(per_condition)

    # One-hot encode the label column (produces columns 0, 1, 2)
    all_samples = pd.concat(
        [all_samples.drop('label', axis=1), pd.get_dummies(all_samples['label'])],
        axis=1)

    # Save file as csv (for now)
    all_samples.to_csv(f'{OUTPUT_FOLDER}{subject_feature_path}/S{subject_id}_feats_4.csv')
    gc.collect()
Last defined function is responsible for combining pre-processed data of all subjects into one file.
def combine_files(subjects):
    """Merge every subject's per-window CSV into one combined file and print
    the per-class sample counts."""
    frames = []
    for subject_id in subjects:
        frame = pd.read_csv(
            f'{OUTPUT_FOLDER}{subject_feature_path}/S{subject_id}_feats_4.csv',
            index_col=0)
        frame['subject'] = subject_id
        frames.append(frame)
    df = pd.concat(frames)

    # Collapse the one-hot columns '0'/'1'/'2' back into a single int label:
    # concatenating the three digits yields e.g. '010', and the position of
    # the '1' is the class index.
    # NOTE(review): assumes the dummy columns were written as 0/1 integers;
    # newer pandas emits booleans ('True'/'False') — verify against the CSVs.
    df['label'] = (df['0'].astype(str) + df['1'].astype(str) + df['2'].astype(str)).apply(lambda x: x.index('1'))
    df.drop(['0', '1', '2'], axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)

    df.to_csv(f'{OUTPUT_FOLDER}/combined_subjects.csv')

    counts = df['label'].value_counts()
    print('Number of samples per class:')
    for label, number in zip(counts.index, counts.values):
        print(f'{int_to_label[label]}: {number}')
Pre-processing¶
In this step, we go through all 15 subjects and create one .csv file with pre-processed data for each subject, plus one .csv file with the combined data.
# -.-|m { input_fold: show, output: false }
# Subject IDs present in the released dataset (note: 1 and 12 are absent).
subject_ids = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]

# Pre-process each subject individually, then merge all per-subject CSVs.
for subject_id in subject_ids:
    print(f'Processing data for S{subject_id}...')
    make_patient_data(subject_id)

combine_files(subject_ids)
print('Processing complete.')